*---------------------------------------------------
cap clear mata
cap clear
capture log close
program drop _all
macro drop _all
version 11
set mem 700m
set mat 2000
set more off
cd "E:\REStat_MS14767_Vol96(2)\Data preparation Compustat segment"
log using "12_estimation_sample.log", replace
*---------------------------------------------------------------


********************************************************************************
********                                                	      **************
********      in this file we create the final estimation sample  **************
********                                                		**************
********************************************************************************

use "segment_compustatvars.dta", clear
so ticker year
********************************************************
*****          here we add the variables related to RJV participation (see RJVpartic.do)
********************************************************

merge 1:m ticker year SIC using "RJVvars.dta"
tab _merge
drop if _merge == 2
drop _merge

********************************************************
*****          here we add the variables related to links(see links.do)
********************************************************

so ticker year
merge 1:1 ticker year SIC using links.dta
tab _merge
drop _merge


********************************************************
*****           here we add the variables related to patents (see patentvars.do)
********************************************************

so cusip year
merge 1:1 cusip year SIC using "patentvars.dta"
tab _merge
drop _merge

************************************************************
*** 		some other patent vars are added (see patentpanel.do)
************************************************************


merge m:1 ticker year using "patentpanel.dta"
tab _merge
drop if _merge==2
drop _merge

************************************************************
*** Remark: Don't care about observations with _merge==2,
*** they are due to different time intervals, not to different
*** firms. That is, patent data starts already in 1969 whereas
*** Compustat starts in 1986. 
**************************************************************


/*		**********************************************************
		*****                VERY IMPORTANT STEP             *****
		*****           WE KEEP ONLY AMERICAN FIRMS          *****
		*****             FOR WHICH WE HAVE SALES            *****
		**********************************************************

count if countryinc!=0

* sales are changed into sales_new by substracting forn_sales

replace forn_sales=0 if  forn_sales==.
gen sales_new=sales- forn_sales
count if sales_new<0
replace sales_new=. if sales_new<0
count if sales_new==.
drop if sales_new==.
label var sales_new "Sales - foreign sales"

compress
drop countryinc vantagekey sales forn_sales

*/

********************* we correct some variables  ****************
********************* and generate some others     ****************

so ticker year SIC
egen firmnum=group(ticker)
label var firmnum "group(ticker)"
* this is just to control that there are no mistake in the panel
* if everythnig is correct no observation should be dropped!!
count if firmnum==firmnum[_n-1]& year==year[_n-1] & SIC ==SIC[_n-1]
*0



********************************************************
*****           
*****           NETWORK
*****           
********************************************************



replace  links1_tot=0 if  links1_tot==.
replace  links2_tot=0 if  links2_tot==.
replace links1=0 if links1==.
replace links2=0 if links2==.
replace links2_same=0 if links2_same==.
replace links2_diff=0 if links2_diff==.

gen links1_ver= links1_tot-links1
label var links1_ver "Pure vertical links (i.e. links with non-competitors) with double counting"
gen links2_ver= links2_tot-links2
label var links2_ver "Pure vertical links (i.e. links with non-competitors) without double counting"


gen network=links2/(ncomp-1)
label var network "links2/competitors"

* we have to make a small correction since for some obs. the number of links is larger than the number of competitors (# of firms -1)
* this is because we have 11 firms in the RJV that have no sales and we dropped these observations
* these firms are: DEC	DGN.	FMC	GCHI	HLN	HPQ	IDL.1	LCE	MSA.1	NSTS	SDW

replace network=1 if links2>ncomp-1
replace network=0 if network==.

gen network2=network*network
label var network2 "network squared"


gen network_ver=links2_ver/(ncomp-1)
label var network "links2_ver/competitors"

gen network_ver2=network_ver*network_ver
label var network "network_ver squared"


		**** dividing between RJV in the same SIC industry 
		**** and in different SIC industry

gen network_s=links2_same/(ncomp-1)
label var network_s "same ind. links2/competitors"

* we have to make a small correction since for some obs. the number of links is larger than the number of competitors (# of firms -1)

replace network_s=1 if links2_same>ncomp-1
replace network_s=0 if network_s==.

gen network_s2=network_s*network_s
label var network_s2 "network_s squared"

gen network_d=links2_diff/(ncomp-1)
label var network_d "diff. ind. links2/competitors"

* we have to make a small correction since for some obs. the number of links is larger than the number of competitors (# of firms -1)
replace network_d=1 if links2_diff>ncomp-1
replace network_d=0 if network_d==.

gen network_d2=network_d*network_d
label var network_d2 "network_d squared"


********************************************************
*****           
*****           INDUSTRY NETWORK
*****           
********************************************************


* here we generate the ind_links which is the aggregation at the industry level

* sum of all links
egen sumlinks=sum(links2), by(SIC year)
label var sumlinks "sum of all links2"

gen ind_network=0
replace ind_network=2*sumlinks/((ncomp-1)*ncomp) if links2>0
label var ind_network "2*sumlinks/((ncomp-1)*ncomp)"

* also here we make a correction
* replace ind_network=1 if ind_net>1 & ind_network!=.

gen ind_network2=ind_network*ind_network
label var ind_network2 "ind_network squared"



********************************************************
*****           
*****           INDUSTRY NETWORK-same
*****           
********************************************************


* here we generate the ind_links which is the aggregation at the industry level

* sum of all links
egen sumlinks_s=sum(links2_same), by(SIC year)
label var sumlinks "sum of all links2_same"

gen ind_network_s=0
replace ind_network_s=2*sumlinks_s/((ncomp-1)*ncomp) if links2>0
label var ind_network_s "2*sumlinks_s/((ncomp-1)*ncomp)"

* also here we make a correction
* replace ind_network_s=1 if ind_network_s>1 & ind_network_s!=.

gen ind_network_s2=ind_network_s*ind_network_s
label var ind_network_s2 "ind_network_s squared"



********************************************************
*****           
*****           INDUSTRY NETWORK-different
*****           
********************************************************


* here we generate the ind_links which is the aggregation at the industry level

* sum of all links
egen sumlinks_d=sum(links2_diff), by(SIC year)
label var sumlinks "sum of all links2_diff"

gen ind_network_d=0
replace ind_network_d=2*sumlinks_d/((ncomp-1)*ncomp) if links2_diff>0
label var ind_network_d "2*sumlinks_d/((ncomp-1)*ncomp)"

* also here we make a correction
* replace ind_network_d=1 if ind_network_d>1 & ind_network_d!=.

gen ind_network_d2=ind_network_d*ind_network_d
label var ind_network_d2 "ind_network_d squared"



********************************************************
*****           
*****           RJV in same/different industry
*****           
********************************************************

drop  RJVsame RJVdiff
gen RJVsame=0
replace RJVsame=1 if links2>0
label var RJVsame "=1 if the firm meets at least one competitior in the RJV(s)"

gen RJVdiff=0
replace RJVdiff= RJV-RJVsame if RJV>0
label var RJVdiff "=1 if the firm does not meet any competitior in the RJV(s)"

gen lRJVtotal=log(RJVtotal+1)
label var lRJVtotal "log(RJVtotal+1)"

desc

use estimation_sample_segment.dta, clear
rename SIC SIC4
egen obs_num=group(ticker SIC4)
rename sales sales_new
*so ticker year

gen lta=log(ta_new+1)
label var lta "Log total assents in millions dollars"


* we drop one industry (59 observations + 1 obs) which has a wierd pattern

gen drop=0
/*
replace drop=1 if SIC4==3241
replace drop=1 if network==1
*drop if drop==1
*/

* here we first define those industries where no RJV was formed
* these observations will represent the "zeros"

gen aa=RJV
egen bb=max(aa), by(SIC4)
replace bb=0 if bb==.
gen zeros=0
replace zeros=1 if bb==0
drop aa bb


gen aa=RJVsame
egen bb=max(aa), by(SIC4)
gen zero_same=0
replace zero_same=1 if bb==0
drop aa bb


replace patents=0 if patents==.
replace patents_t_1=0 if patents_t_1==.
replace patents_t_2=0 if patents_t_2==.
replace patents_t_3=0 if patents_t_3==.
replace patents_t_4=0 if patents_t_4==.


gen only_ver=0
replace only_ver=1 if network==0 & RJV==1

egen m_rdexpense=mean(rd_corr), by(SIC4 year)
label var m_rdexpense"Mean R&D exp. at the SIC4/year level"

egen m_lta=mean(lta), by(SIC4 year)
label var m_lta "Mean log total asset at the SIC4/year level

* here we redefine the RJV dummies

gen RJV2=RJV
replace RJV2=0 if RJV2==.


* horizontal vs. vertical

gen RJVsame2=RJVsame
replace RJVsame2=0 if RJVsame2==.
rename RJVsame2 RJVhor
label var RJVhor "RJV with firms from the same industry"


gen RJVdiff2=RJVdiff
replace RJVdiff2=0 if RJVdiff==.
rename RJVdiff2 RJVver
label var RJVver "RJV without firms from the same industry"

* horizontal in the same industry vs. different industry


gen RJVsame2=0
replace RJVsame2=1 if network_s>0
label var RJVsame2 "RJV with competitors in the same industry"

gen RJVdiff2=0
replace RJVdiff2=1 if RJVsame2==0 & RJVhor==1
label var RJVsame2 "RJV with competitors in different industries"


* we replace the missing with zeros in the r&d variable 

gen rd2=rd_corr
replace rd2=0 if rd2==. | rd2<0
gen lrd2=log(rd2+1)

gen ta=ta_corr
replace ta=0 if ta==. 
replace lta=log(ta+1)

* we create a R&D intensity variable
gen rd_int=rd2/sales_new


* we generate industry specific variables

egen m_rd=mean(rd2), by(SIC4 year)
label var m_rd "Mean R&D exp. at the SIC4/year level

egen m_lrd=mean(lrd2), by(SIC4 year)
label var m_rd "Mean log R&D exp. at the SIC4/year level

* we generate other coverage variables

gen only_horiz=0
replace only_horiz=1 if network>0

egen nRJVfirm_hor=sum(only_horiz), by(SIC4 year)
gen coverage= nRJVfirm_hor/ncom
replace coverage=0 if network==0
gen coverage2=coverage*coverage


gen only_horiz_s=0
replace only_horiz_s=1 if network_s>0


*** we generate the coverage variables

egen nRJVfirm_hor_s=sum(only_horiz_s), by(SIC4 year)
gen coverage_s= nRJVfirm_hor_s/ncom
replace coverage_s=0 if network_s==0
gen coverage_s2=coverage_s*coverage_s

egen coverageMS=sum(MS) if coverage>0, by(SIC4 year)
replace coverageMS=0 if network==0
gen coverageMS2=coverageMS*coverageMS


*** we generate the vertical links

replace links2_tot=0 if links2_tot==.
gen links_ver=links2_ver

*label var RJVeff2 "Heterogenous effect (RJV dummies)"
*label var RJVeff_s2 "Heterogenous effect (RJV dummies)"
label var network "RJV network coverage"
label var network_s "RJV network coverage"
label var network_d "RJV network coverage"
label var RJV2 "RJV"
label var RJVver "Vertical RJV"
label var RJVhor "Horizontal RJV"
label var links_ver "Vertical links"
label var lrd2 "log(R&D)"
label var m_lrd "log(R&D) SIC4"

gen RJVhor_s=0
replace RJVhor_s=1 if network>0 & network<.0267857

gen RJVhor_m=0
replace RJVhor_m=1 if network>=.0267857 & network<.13

gen RJVhor_l=0
replace RJVhor_l=1 if network>=.13

gen pat_corr=patents*sales_prop
gen networkMS=links_MS2_tot


*****************************************************
*** here we define a firm to be in vertical networks if all its formed pair in 1997
*** are with other firms with which the closeness measure in the product space is zero
*** i.e. the intersection of the SIC4 vectors is empty
*****************************************************

so ticker year
merge ticker using "RJVver_sure97"
tab _merge
drop if _merge==2

xtset obs_num year
rename  RJVver_sure  RJVver_sure97
gen  RJVver_sure= RJVver_sure97
replace  RJVver_sure=0 if  RJVver_sure==.
replace RJVver_sure=0 if RJV2==0
gen RJVver_nsure=RJVver
replace RJVver_nsure=0 if RJVver_sure==1

tab RJVver_sure RJVhor if zero_same==0

* here we observe that we have 626 observations which are horizontal for our databse
* and are assigned ot be vertical in 1997

gen inconsistent=0
replace inconsistent=1 if RJVhor==1 & RJVver_sure==1
tab year inconsistent

* we can assign the inconsistent cases to horizontal
* i.e we define RJVver_sure2 which is 0 for inconistent cases 

gen  RJVver_sure2= RJVver_sure
replace  RJVver_sure2=0 if  inconsistent==1

gen RJVver_nsure2=RJVver
replace RJVver_nsure2=0 if RJVver_sure2==1

keep MS sales_new ta rd2 rd_int patents pat_corr network networkMS coverage RJVsametot links2_ver m_rdexpense m_lta ncomp RJV2 zero_same  dy* ticker RJVver RJVhor links_ver-RJVver_nsure2 obs_num year drop lrd2 m_lrd patents* network2
save RESTATestimation_sample_segment.dta, replace

log close
